1 Loading data and packages

library(readxl)
library(readr)
library(CTexploreR)
library(Vennerable)
library(biomaRt)
library(tidyverse)
library(SummarizedExperiment)
library(UpSetR)
library(ComplexHeatmap)
library(circlize)
library(SingleCellExperiment)
library(org.Hs.eg.db)
library(clusterProfiler)
library(msigdbr)
library(DOSE)
library(BiocParallel)
library(patchwork)
library(Biostrings)

Gene names/synonyms required for databases cleaning

ensembl <- biomaRt::useDataset("hsapiens_gene_ensembl", useMart("ensembl"))
attributes_vector <- c("ensembl_gene_id", "external_gene_name",
                       "external_synonym", "gene_biotype",
                       "chromosome_name", "band", "start_position", "end_position",
                       "strand")
ensembl_gene_synonym <- as_tibble(getBM(attributes = attributes_vector, mart = ensembl))

ensembl_gene_synonym <- ensembl_gene_synonym %>%
  mutate(external_synonym = sub(pattern = "ORF", external_synonym, 
                                replacement = "orf"))

attributes_vector <- c("ensembl_gene_id", "external_gene_name")
ensembl_gene_names <- as_tibble(getBM(attributes = attributes_vector, mart = ensembl))
GTEX_data <- CTdata::GTEX_data()
## Error in readRDS(file) : error reading from connection
normal_tissues_multimapping_data <- CTdata::normal_tissues_multimapping_data()
CCLE_data <- CTdata::CCLE_data()
TCGA_TPM <- CTdata::TCGA_TPM()
DAC_treated_cells_multimapping <- CTdata::DAC_treated_cells_multimapping()
testis_sce <- CTdata::testis_sce()
scRNAseq_HPA <- CTdata::scRNAseq_HPA()
CT_mean_methylation_in_tissues <- CTdata::CT_mean_methylation_in_tissues()

Common figures parameters

legends_param <- list(
  labels_gp = gpar(col = "black", fontsize = 6),
  title_gp = gpar(col = "black", fontsize = 6),
  row_names_gp = gpar(fontsize = 4),
  annotation_name_side = "left")

legend_colors <- c("#5E4FA2", "#3288BD", "#66C2A5", "#ABDDA4", "#E6F598",
                   "#FFFFBF", "#FEE08B", "#FDAE61", "#F46D43", "#D53E4F",
                   "#9E0142")
chr_colors <- c("X-linked" = "deeppink", "Not X" = "royalblue1")

meth_colors <- c("Methylation" = "lightgreen", "Not methylation" = "gray")

2 Database comparison

2.1 Lists cleaning

CT lists from other databases have been checked (using GTEx and our GTEx_expression() funtion and GeneCards) in order to remove duplicated gene names or deprecated ones and allow comparison between databases.

2.1.1 CTdatabase

Online list copied in a csv file, several lists exist so we combined them.

We checked gene names that were a concetanation of two genes (choice using biomaRt synonyms to get the official one), checked which ones had the right names, removed duplicated genes, verified lost genes and added back those that should be there.

CTdatabase <- read_delim("data/CTdatabase1.csv", delim = ";", 
                         escape_double = FALSE, trim_ws = TRUE)
colnames(CTdatabase) <- c("Family", "Gene_Name", "Chromosomal_localization",
                          "CT_identifier")
CTdatabase_bis <- read_csv2("data/CTdatabase2.csv")
CTdatabase <- left_join(CTdatabase, CTdatabase_bis, 
                        by = c("Gene_Name" = "Gene_Symbol"))


CTdatabase_single <- CTdatabase %>%
  mutate(Gene_Name = sub(pattern = "/.*$", Gene_Name, replacement = ""))
CTdatabase_single <- CTdatabase_single %>%
  mutate(Gene_Name = sub(pattern = ",.*$", Gene_Name, replacement = ""))


CTdatabase_official_names <- 
  unique(dplyr::select(ensembl_gene_synonym, ensembl_gene_id, 
                       external_gene_name)) %>%
  filter(external_gene_name %in% CTdatabase_single$Gene_Name) %>%
  mutate(Gene_Name = external_gene_name) %>%
  mutate(external_synonym = NA)
CTdatabase_synonym <- 
  ensembl_gene_synonym %>%
  filter(external_synonym %in% CTdatabase_single$Gene_Name) %>%
  mutate(Gene_Name = external_synonym) %>%
  dplyr::select(ensembl_gene_id, external_gene_name, Gene_Name, external_synonym)
CTdatabase_cleaned <- 
  rbind(CTdatabase_official_names, CTdatabase_synonym) %>% 
  left_join(CTdatabase_single)


duplicated_genes <- CTdatabase_cleaned$Gene_Name[duplicated(CTdatabase_cleaned$Gene_Name)]
bad_ids <- ensembl_gene_synonym %>%
  filter(external_gene_name %in% duplicated_genes | external_synonym %in% duplicated_genes) %>%
  filter(chromosome_name %in% grep(pattern = "H", x = chromosome_name, value = TRUE)) %>%
  pull(ensembl_gene_id)
CTdatabase_cleaned <- CTdatabase_cleaned %>%
  dplyr::filter(!ensembl_gene_id %in% bad_ids)
CTdatabase_cleaned <- CTdatabase_cleaned %>%
  filter(!ensembl_gene_id == "ENSG00000052126")
CTdatabase_cleaned <- CTdatabase_cleaned %>% 
  filter(!(ensembl_gene_id == "ENSG00000183305" & Gene_Name == "MAGEA2"))
CTdatabase_cleaned <- CTdatabase_cleaned %>% 
  filter(!ensembl_gene_id == "ENSG00000204648")
CTdatabase_cleaned <- filter(CTdatabase_cleaned, Gene_Name != "CSAG3B")
CTdatabase_cleaned[CTdatabase_cleaned$Gene_Name == "CSAG2", "external_synonym"] <- "CSAG3B"
CTdatabase_cleaned <- filter(CTdatabase_cleaned, Gene_Name != "CT45A4")
CTdatabase_cleaned[CTdatabase_cleaned$Gene_Name == "CT45A3", "external_synonym"] <- "CT45A4"
CTdatabase_cleaned <- filter(CTdatabase_cleaned, Gene_Name != "LAGE-1b")
CTdatabase_cleaned[CTdatabase_cleaned$Gene_Name == "CTAG2", "external_synonym"] <- "LAGE-1b"
CTdatabase_cleaned <- filter(CTdatabase_cleaned, Gene_Name != "CT16.2")
CTdatabase_cleaned[CTdatabase_cleaned$Gene_Name == "PAGE5", "external_synonym"] <- "CT16.2"
CTdatabase_cleaned <- filter(CTdatabase_cleaned, Gene_Name != "SPANXB2")
CTdatabase_cleaned[CTdatabase_cleaned$Gene_Name == "SPANXB1", "external_synonym"] <- "SPANXB2"
CTdatabase_cleaned <- filter(CTdatabase_cleaned, Gene_Name != "SPANXE")
CTdatabase_cleaned[CTdatabase_cleaned$Gene_Name == "SPANXD", "external_synonym"] <- "SPANXE"
CTdatabase_cleaned <- filter(CTdatabase_cleaned, Gene_Name != "XAGE1C")
CTdatabase_cleaned <- filter(CTdatabase_cleaned, Gene_Name != "XAGE1D")
CTdatabase_cleaned <- filter(CTdatabase_cleaned, Gene_Name != "XAGE1E")
CTdatabase_cleaned <- filter(CTdatabase_cleaned, Gene_Name != "XAGE2B")
CTdatabase_cleaned[CTdatabase_cleaned$Gene_Name == "XAGE2", "external_synonym"] <- "XAGE2B"
CTdatabase_cleaned <- filter(CTdatabase_cleaned, Gene_Name != "CTAGE-2")
CTdatabase_cleaned[CTdatabase_cleaned$Gene_Name == "CTAGE1", "external_synonym"] <- "CTAGE-2"


CTdatabase_cleaned <- ensembl_gene_synonym %>%
  mutate(Gene_Name = external_synonym) %>%
  filter(external_synonym == "CXorf61") %>%
  dplyr::select(ensembl_gene_id, external_gene_name, Gene_Name, external_synonym) %>%
  cbind(CTdatabase_single[CTdatabase_single$Gene_Name == "Cxorf61", 
                    c("Family", "Chromosomal_localization", "CT_identifier", "Classification")]) %>% 
  rbind(CTdatabase_cleaned)
CTdatabase_cleaned <- unique(dplyr::select(ensembl_gene_synonym, ensembl_gene_id, external_gene_name)) %>%
  filter(external_gene_name == "CCNA1") %>%
  mutate(Gene_Name = external_gene_name) %>%
  mutate(external_synonym = NA) %>% 
  cbind(CTdatabase_single[CTdatabase_single$Gene_Name == "cyclin A1", 
                          c("Family", "Chromosomal_localization", "CT_identifier", "Classification")]) %>% 
  rbind(CTdatabase_cleaned)
CTdatabase_cleaned <- unique(dplyr::select(ensembl_gene_synonym, ensembl_gene_id, external_gene_name))%>%
  filter(external_gene_name == "GOLGA6L2") %>%
  filter(ensembl_gene_id == "ENSG00000174450") %>%
  mutate(Gene_Name = external_gene_name) %>%
  mutate(external_synonym = NA) %>% 
  cbind(CTdatabase_single[CTdatabase_single$Gene_Name == "GOLGAGL2 FA", 
                          c("Family", "Chromosomal_localization", "CT_identifier", "Classification")]) %>% 
  rbind(CTdatabase_cleaned)
CTdatabase_cleaned <- unique(dplyr::select(ensembl_gene_synonym, ensembl_gene_id, external_gene_name))%>%
  filter(external_gene_name == "LYPD6B") %>%
  mutate(Gene_Name = external_gene_name) %>%
  mutate(external_synonym = NA) %>% 
  cbind(CTdatabase_single[CTdatabase_single$Gene_Name == "LOC130576", 
                          c("Family", "Chromosomal_localization", "CT_identifier", "Classification")]) %>% 
  rbind(CTdatabase_cleaned)
CTdatabase_cleaned <- unique(dplyr::select(ensembl_gene_synonym, ensembl_gene_id, external_gene_name))%>%
  filter(external_gene_name == "CT62") %>%
  mutate(Gene_Name = external_gene_name) %>%
  mutate(external_synonym = NA) %>% 
  cbind(CTdatabase_single[CTdatabase_single$Gene_Name == "LOC196993", 
                          c("Family", "Chromosomal_localization", "CT_identifier", "Classification")]) %>% 
  rbind(CTdatabase_cleaned)
CTdatabase_cleaned <- unique(dplyr::select(ensembl_gene_synonym, ensembl_gene_id, external_gene_name))%>%
  filter(external_gene_name == "CT75") %>%
  filter(ensembl_gene_id == "ENSG00000291155") %>%
  mutate(Gene_Name = external_gene_name) %>%
  mutate(external_synonym = NA) %>% 
  cbind(CTdatabase_single[CTdatabase_single$Gene_Name == "LOC440934", 
                          c("Family", "Chromosomal_localization", "CT_identifier", "Classification")]) %>% 
  rbind(CTdatabase_cleaned)
CTdatabase_cleaned <- unique(dplyr::select(ensembl_gene_synonym, ensembl_gene_id, external_gene_name))%>%
  filter(external_gene_name == "LINC01192") %>%
  mutate(Gene_Name = external_gene_name) %>%
  mutate(external_synonym = NA) %>% 
  cbind(CTdatabase_single[CTdatabase_single$Gene_Name == "LOC647107", 
                          c("Family", "Chromosomal_localization", "CT_identifier", "Classification")]) %>% 
  rbind(CTdatabase_cleaned)
CTdatabase_cleaned <- unique(dplyr::select(ensembl_gene_synonym, ensembl_gene_id, external_gene_name))%>%
  filter(external_gene_name == "TSPY1") %>%
  mutate(Gene_Name = external_gene_name) %>%
  mutate(external_synonym = NA) %>% 
  cbind(CTdatabase_single[CTdatabase_single$Gene_Name == "LOC728137", 
                          c("Family", "Chromosomal_localization", "CT_identifier", "Classification")]) %>% 
  rbind(CTdatabase_cleaned)
CTdatabase_cleaned <- unique(dplyr::select(ensembl_gene_synonym, ensembl_gene_id, external_gene_name))%>%
  filter(external_gene_name == "SSX2B") %>%
  mutate(Gene_Name = external_gene_name) %>%
  mutate(external_synonym = NA) %>% 
  cbind(CTdatabase_single[CTdatabase_single$Gene_Name == "SSX2b", 
                          c("Family", "Chromosomal_localization", "CT_identifier", "Classification")]) %>% 
  rbind(CTdatabase_cleaned)

2.1.2 Jamin’s list

Excel file coming from supplemental data.

Jamin_core_CT <- read_excel("data/Jamin_core_CT.xlsx")
Jamin_core_CT[Jamin_core_CT$Gene == "KIAA1211", "Gene"] <- "CRACD"
Jamin_core_CT[Jamin_core_CT$Gene == "CXorf67", "Gene"] <- "EZHIP"

2.1.3 Wang’s CTatlas

Excel file coming from supplemental data.

Wang_CT <- read_excel("data/Wang_Suppl_Data_3.xlsx", 
    sheet = "Supplementary Data 3B", skip = 1)
colnames(Wang_CT)[1] <- "ensembl_gene_id"

Wang_CT <- ensembl_gene_names %>% 
  filter(ensembl_gene_id %in% Wang_CT$ensembl_gene_id) %>%
  right_join(Wang_CT)

Wang_CT[Wang_CT$ensembl_gene_id == "ENSG00000181013", "external_gene_name"] <- "C17orf47"
Wang_CT[Wang_CT$ensembl_gene_id == "ENSG00000204293", "external_gene_name"] <- "OR8B2"
Wang_CT[Wang_CT$external_gene_name == "", "external_gene_name"] <- "RNASE11"
Wang_CT[Wang_CT$external_gene_name == "CHCT1", "external_gene_name"] <- "C17orf64"
Wang_CT[Wang_CT$external_gene_name == "PRSS40A", "external_gene_name"] <- "TISP43"
Wang_CT[Wang_CT$external_gene_name == "TEX56P", "external_gene_name"] <- "C6orf201"
Wang_CT[Wang_CT$external_gene_name == "SLC25A51P4", "external_gene_name"] <- "RP11-113D6.10"
Wang_CT[Wang_CT$external_gene_name == "TCP10L3", "external_gene_name"] <- "TCP10"
Wang_CT[Wang_CT$external_gene_name == "SCAND3", "external_gene_name"] <- "ZBED9"

2.1.4 Carter’s list

Carter_CT_list <- read_table("data/Carter_CT_list.txt", skip = 1)
Carter_CT <- Carter_CT_list[Carter_CT_list$CT_Expression,]

Carter_CT[Carter_CT$Gene == "ENSG00000261649", "Gene_Name"] <- "GOLGA6L7"
Carter_CT[Carter_CT$Gene == "ENSG00000239620", "Gene_Name"] <- "PRR20G"
Carter_CT[Carter_CT$Gene == "ENSG00000168148", "Gene_Name"] <- "H3-4"
Carter_CT[Carter_CT$Gene == "ENSG00000204296", "Gene_Name"] <- "TSBP1"
Carter_CT[Carter_CT$Gene == "ENSG00000180219", "Gene_Name"] <- "GARIN6"
Carter_CT[Carter_CT$Gene == "ENSG00000172717", "Gene_Name"] <- "GARIN2"
Carter_CT[Carter_CT$Gene == "ENSG00000174015", "Gene_Name"] <- "CBY2"
Carter_CT[Carter_CT$Gene == "ENSG00000224960", "Gene_Name"] <- "PPP4R3C"

2.2 CTexploreR data for selection pipeline

To characterise the differences between our database and other, we need the category we created in CTexploreR.

Hereunder is what we used for our selection pipeline (coming from make_CT_list.R in CTdata), mainly how we created the data.

# GTEX data with the tissue specificity category determined
all_genes <- as_tibble(rowData(GTEX_data), rownames = "ensembl_gene_id")
all_genes$TPM_testis <- assay(GTEX_data)[, "Testis"]

# Add multimapping_analysis assessing testis-specificity of genes "lowly_expressed"

all_genes <- all_genes %>%
  left_join(as_tibble(rowData(normal_tissues_multimapping_data), 
                      rownames = "ensembl_gene_id"))

# Summarise both specificity

all_genes <- all_genes %>%
  mutate(testis_specificity = case_when(
    GTEX_category == "testis_specific" |
      multimapping_analysis == "testis_specific" ~ "testis_specific",
    GTEX_category == "testis_preferential" ~ "testis_preferential"))

# Add testis scRNA seq highest expressed cell type

all_genes <- all_genes %>%
  left_join(as_tibble(rowData(testis_sce)) %>%
              dplyr::select(external_gene_name, testis_cell_type))

# Add higher in somatic scRNAseq data of normal tissues from the Human Protein Atlas 

all_genes <- all_genes %>%
  left_join(as_tibble(rowData(scRNAseq_HPA), rownames = "ensembl_gene_id") %>%
              dplyr::select(ensembl_gene_id, external_gene_name,
                            Higher_in_somatic_cell_type))

# CCLE database analysis added

all_genes <- all_genes %>%
  left_join(as_tibble(rowData(CCLE_data), rownames = "ensembl_gene_id"))
all_genes[is.na(all_genes$CCLE_category), "CCLE_category"] <- "not_in_CCLE"


# TCGA database analysis added

all_genes <- all_genes %>%
  left_join(as_tibble(rowData(TCGA_TPM), rownames = "ensembl_gene_id") %>%
              dplyr::select(ensembl_gene_id, external_gene_name, percent_pos_tum,
                            percent_neg_tum, max_TPM_in_TCGA, TCGA_category))

all_genes[all_genes$lowly_expressed_in_GTEX == TRUE &
            all_genes$multimapping_analysis == "testis_specific",
          "TCGA_category"] <- "multimapping_issue"

# Add DAC analysis
induced <- as_tibble(rowData(DAC_treated_cells_multimapping), 
                     rownames = "ensembl_gene_id") %>%
  filter(induced == TRUE) %>%
  pull(external_gene_name)

all_genes <- all_genes %>%
  mutate(DAC_induced = case_when(external_gene_name %in% induced ~ TRUE,
                                 !external_gene_name %in% induced ~ FALSE))

# Add the most biologically relevant transcript (canonical and coordinates)

ensembl <- biomaRt::useDataset("hsapiens_gene_ensembl", useMart("ensembl"))
attributes_vector <- c("ensembl_gene_id",
                       "external_gene_name",
                       "ensembl_transcript_id",
                       "external_transcript_name",
                       "chromosome_name",
                       "strand",
                       "transcript_start",
                       "transcript_end",
                       "transcription_start_site",
                       "transcript_length",
                       "transcript_biotype",
                       "transcript_is_canonical")
transcripts_infos <- as_tibble(biomaRt::getBM(attributes = attributes_vector,
                                              mart = ensembl))
canonical_transcripts <- transcripts_infos %>%
  filter(transcript_is_canonical == 1) %>%
  filter(chromosome_name %in% c(1:22, "X", "Y", "MT")) %>%
  filter(transcript_biotype == "protein_coding" | transcript_biotype == "lncRNA")
all_genes <- all_genes %>%
  left_join(canonical_transcripts %>%
              dplyr::select(ensembl_gene_id,
                            external_transcript_name, ensembl_transcript_id,
                            chromosome_name, strand, transcript_start,
                            transcript_end, transcription_start_site,
                            transcript_length, transcript_biotype))

From there, we filtered based on the testis_specificity (“testis_specific” or “testis_preferential”), testis_cell_type (not “Macrophage”, “Endothelial”, “Myoid”, “Sertoli”, “Leydig”), Higher_in_somatic_cell_type (TRUE), CCLE_category (“activated”) and TCGA_category (“activated” or “multimapping_issue”) to have our CT genes. Then the most relevant transcript was validated manually (and check that there is a transcript activated in testis and the same is activated in tumor).

2.3 CTexploreR VS CTdatabase

CTdatabase_ours <- Venn(list(CTdatabase = CTdatabase_cleaned$external_gene_name,
                             CTexploreR = CT_genes$external_gene_name))
gp <- VennThemes(compute.Venn(CTdatabase_ours))
gp[["Face"]][["11"]]$fill <-  "mistyrose"
gp[["Face"]][["01"]]$fill <-  "darkseagreen1"
gp[["Face"]][["10"]]$fill <-  "lightsteelblue1"
gp[["Set"]][["Set1"]]$col <-  "paleturquoise4"
gp[["Set"]][["Set2"]]$col <-  "darkseagreen4"
gp[["SetText"]][["Set1"]]$col <-  "paleturquoise4"
gp[["SetText"]][["Set2"]]$col <-  "darkseagreen4"
plot(CTdatabase_ours, gp = gp)

Lost genes analysis

CTdatabase_lost <- all_genes %>%
  filter(external_gene_name %in% CTdatabase_ours@IntersectionSets[["10"]])

# 10 Genes are lost because not in any  database


CTdatabase_lost[is.na(CTdatabase_lost$testis_specificity), ]$testis_specificity <- "not_specific"
table(CTdatabase_lost$testis_specificity)
## 
##        not_specific testis_preferential     testis_specific 
##                  85                  14                  38
table(CTdatabase_lost$testis_cell_type)
## 
##  Early_spermatocyte Elongated_spermatid   Late_spermatocyte              Leydig 
##                  15                  14                  15                   2 
##     Round_spermatid             Sertoli              Sperm1              Sperm2 
##                  39                   2                   5                   9 
##       Spermatogonia                 SSC 
##                   2                  11
table(CTdatabase_lost$Higher_in_somatic_cell_type)
## 
## FALSE  TRUE 
##    95    28
table(CTdatabase_lost$TCGA_category)
## 
##          activated              leaky multimapping_issue      not_activated 
##                 48                 46                  7                 36
table(CTdatabase_lost$CCLE_category)
## 
##     activated         leaky not_activated 
##            54            32            51

2.4 CTexploreR VS omics databases

core_ours <- Venn(list(Jamin = Jamin_core_CT$Gene, 
                       CTexploreR = CT_genes$external_gene_name))

Wang_ours <- Venn(list(Wang = Wang_CT$external_gene_name, 
                       CTexploreR = CT_genes$external_gene_name))

Carter_ours <- Venn(list(Carter_CT = Carter_CT$Gene_Name, 
                         CTexploreR = CT_genes$external_gene_name))
gene_list <- list(Jamin = Jamin_core_CT$Gene, 
                  CTdatabase = CTdatabase_cleaned$external_gene_name, 
                  CTatlas = Wang_CT$external_gene_name, 
                  CTexploreR = CT_genes$external_gene_name, 
                  Carter = Carter_CT$Gene_Name)

upset_omics <- fromList(gene_list[-2])
upset(upset_omics)

Lost genes analysis

Jamin_lost <- all_genes %>%
  filter(external_gene_name %in% core_ours@IntersectionSets[["10"]])

Jamin_lost[is.na(Jamin_lost$testis_specificity), ]$testis_specificity <- "not_specific"
table(Jamin_lost$testis_specificity)
## 
##        not_specific testis_preferential     testis_specific 
##                  78                   5                   2
table(Jamin_lost$testis_cell_type)
## 
##  Early_spermatocyte Elongated_spermatid         Endothelial   Late_spermatocyte 
##                  10                   9                   1                  13 
##          Macrophage     Round_spermatid             Sertoli              Sperm1 
##                   1                  22                   2                   4 
##              Sperm2       Spermatogonia                 SSC 
##                   4                   4                   4
table(Jamin_lost$Higher_in_somatic_cell_type)
## 
## FALSE  TRUE 
##    60    22
table(Jamin_lost$TCGA_category)
## 
##     activated         leaky not_activated 
##            15            56            14
table(Jamin_lost$CCLE_category)
## 
##     activated         leaky not_activated 
##            39            42             4
Wang_lost <- all_genes %>%
  filter(external_gene_name %in% Wang_ours@IntersectionSets[["10"]])

Wang_lost[is.na(Wang_lost$testis_specificity), ]$testis_specificity <- "not_specific"
table(Wang_lost$testis_specificity)
## 
##        not_specific testis_preferential     testis_specific 
##                 470                 109                 249
table(Wang_lost$testis_cell_type)
## 
##  Early_spermatocyte Elongated_spermatid         Endothelial   Late_spermatocyte 
##                  69                 137                   1                  76 
##              Leydig               Myoid     Round_spermatid             Sertoli 
##                   1                   1                 308                  18 
##              Sperm1              Sperm2       Spermatogonia                 SSC 
##                  41                  69                  22                  43
table(Wang_lost$Higher_in_somatic_cell_type)
## 
## FALSE  TRUE 
##   734    66
table(Wang_lost$TCGA_category)
## 
##          activated              leaky multimapping_issue      not_activated 
##                302                302                  5                219
table(Wang_lost$CCLE_category)
## 
##     activated         leaky not_activated 
##           256           206           366
Carter_lost <- all_genes %>%
  filter(external_gene_name %in% Carter_ours@IntersectionSets[["10"]])

Carter_lost[is.na(Carter_lost$testis_specificity), ]$testis_specificity <- "not_specific"
table(Carter_lost$testis_specificity)
## 
##        not_specific testis_preferential     testis_specific 
##                   1                   8                  42
table(Carter_lost$testis_cell_type)
## 
##  Early_spermatocyte Elongated_spermatid   Late_spermatocyte     Round_spermatid 
##                   4                  16                   3                  20 
##             Sertoli              Sperm1       Spermatogonia                 SSC 
##                   2                   1                   1                   2
table(Carter_lost$Higher_in_somatic_cell_type)
## 
## FALSE  TRUE 
##    48     3
table(Carter_lost$TCGA_category)
## 
##     activated         leaky not_activated 
##            23             9            19
table(Carter_lost$CCLE_category)
## 
##     activated         leaky not_activated 
##            10             3            38

2.5 Characterisation of differences with all databases

common <- unique(c(core_ours@IntersectionSets[["11"]], 
                   CTdatabase_ours@IntersectionSets[["11"]], 
                   Wang_ours@IntersectionSets[["11"]], 
                   Carter_ours@IntersectionSets[["11"]]))

length(common)
## [1] 212
length(common)/dim(CT_genes)[1] * 100
## [1] 71.14094
lost_list <- unique(c(core_ours@IntersectionSets[["10"]], 
                 CTdatabase_ours@IntersectionSets[["10"]],
                 Wang_ours@IntersectionSets[["10"]],
                 Carter_ours@IntersectionSets[["10"]]))

lost <- all_genes %>%
  filter(external_gene_name %in% lost_list)

not_specific <- filter(lost, is.na(testis_specificity))

GTEX_expression(not_specific$external_gene_name, units = "log_TPM")

somatic_testis <- filter(lost, testis_cell_type %in% c( "Macrophage", 
                                                        "Endothelial", "Myoid",
                                                        "Sertoli", "Leydig"))

testis_expression(somatic_testis$external_gene_name, cells = "all")
## `use_raster` is automatically set to TRUE for a matrix with more than
## 2000 columns You can control `use_raster` argument by explicitly
## setting TRUE/FALSE to it.
## 
## Set `ht_opt$message = FALSE` to turn off this message.

somatic_tissue <- filter(lost, Higher_in_somatic_cell_type == TRUE)

HPA_cell_type_expression(somatic_tissue$external_gene_name)

not_TCGA_activated <- filter(lost, TCGA_category != "activated" & 
                               TCGA_category != "multimapping_issue")

TCGA_expression(not_TCGA_activated$external_gene_name,
                tumor = "all",
                units = "log_TPM")
## `use_raster` is automatically set to TRUE for a matrix with more than
## 2000 columns You can control `use_raster` argument by explicitly
## setting TRUE/FALSE to it.
## 
## Set `ht_opt$message = FALSE` to turn off this message.

not_CCLE_activated <- filter(lost, CCLE_category  != "activated")

CCLE_expression(not_CCLE_activated$external_gene_name,
                  type = c("lung", "skin", "colorectal",
                           "gastric", "breast", "head_and_neck"),
                units = "log_TPM")

transcript_prob <- lost %>% 
  filter(testis_specificity == "testis_specific" |
           testis_specificity == "testis_preferential") %>%
  filter(TCGA_category == "activated" | TCGA_category == "multimapping_issue") %>% 
  filter(CCLE_category == "activated") %>% 
  dim()

We have lost 979 genes in total. Among them, 58.0183861% are not considered testis specific, 2.8600613% are expressed in testis somatic cells, 11.13381% are expressed in somatic cells, 62.206333% are not activated in TCGA samples, 66.4964249% are not activated in CCLE cell lines and 1.9407559% is lost due to transcripts problems.

What about new genes in CTexploreR

new <- CT_genes %>% 
  filter(!external_gene_name%in%common)

new
table(new$testis_specificity)
## 
## testis_preferential     testis_specific 
##                  24                  62
table(new$X_linked, new$regulated_by_methylation)
##        
##         FALSE TRUE
##   FALSE    50   24
##   TRUE      1   11
TCGA_expression(tumor = "all", genes = new$external_gene_name, 
                units = "log_TPM")
## `use_raster` is automatically set to TRUE for a matrix with more than
## 2000 columns You can control `use_raster` argument by explicitly
## setting TRUE/FALSE to it.
## 
## Set `ht_opt$message = FALSE` to turn off this message.

TCGA_expression(tumor = "all", 
                genes = filter(new, X_linked & regulated_by_methylation)$external_gene_name, 
                units = "log_TPM")
## `use_raster` is automatically set to TRUE for a matrix with more than
## 2000 columns You can control `use_raster` argument by explicitly
## setting TRUE/FALSE to it.
## 
## Set `ht_opt$message = FALSE` to turn off this message.

There are 86 new CT genes in CTexploreR. These are mainly testis specific and on autosomes. Regulation by methylation is not the majority of them. This makes sens as other databases have been focused on regulation by methylation, so as here we don’t segregate with that we can find these. There is only 11 new “major” CT that are on the X chromosome and regulated by methylation. CT45 are not that new.

Expression in tumours doesn’t strike that much.

2.6 Other databases : minor analysis

There are ocassionnally other CT genes list that exist in litterature but there is not always details on the pipeline, the way it is maintained or handled. There is usually a lack of structure.

2.6.1 Bruggeman 2018

They started with the fact that most CT genes have been identified using whole testis, this thus don’t make them germline specific and diminishes their interest as targets for cancer immunotherapy. Their aim was thus to identify true germ-cell specific cancer genes. They thus used transcriptomes of specific germ cell-types present in healthy human spermatogenesis and compared it to data from GTEx and the TCGA. This allowed them to identify 756 germ cell specific genes that are widely expressed in cancer. They state that “they’re certain that their genes are true GC-genes”. However, they don’t validate the known enrichment in CT genes as their genes are distributed evenly across all chromosomes

The human male germ cells transcriptome they use comes from tissue samples from 6 males, cell subtypes were isolated using a laser capture microdissection before RNA sequencing.

Data was available through a web application that gives now an error, and is also in an excel supplemental file.

Bruggeman_CT <- read_excel("data/Bruggeman_suppl_data.xlsx", skip = 1,
                           sheet = "1D")
## New names:
## • `HUGO` -> `HUGO...7`
## • `HUGO` -> `HUGO...14`
GTEX_expression(Bruggeman_CT$Gene, units = "log_TPM")
## Warning: 76 out of 756 names invalid: ADAM6, CTAGE5, C21orf59, C11orf57,
## LRRC37BP1, LYRM5, C7orf55, KIAA0391, CASC5, KIAA1524, PA2G4P4,
## C10orf12, PIPSL, FAM175A, MTL5, RPL19P12, SGOL2, STAG3L1, KIAA1919,
## KIAA2022, FAM188B, C16orf59, TTTY15, C17orf53, YBX3P1, TMEM194B,
## ZNF788, MSL3P1, ZNF702P, WDR92, LINC00476, UQCRBP1, ATP5EP2, RPL23P8,
## DPY19L2P2, ARHGAP11B, SGOL1, CCDC173, RPL23AP53, FAM19A3, C12orf66,
## PPIEL, C9orf84, C10orf25, WDR78, C20orf197, PRKY, GUSBP2, ATP6AP1L,
## FAM122C, BMS1P4, MRS2P2, ANXA2P3, C3orf67, PLEKHA8P1, C8orf37, HLA-L,
## HILS1, RPLP0P2, FAM86JP, ZNRD1-AS1, ZNF876P, TMEM14E, LACE1, SEC14L1P1,
## PIN4P1, C22orf34, RPL21P44, TCAM1P, FAM35DP, METTL20, PPP1R2P3,
## SLC6A10P, TCTE3, ZNF815P, MLLT4-AS1.
## See the manual page for valid types.

TCGA_expression(Bruggeman_CT$Gene, units = "log_TPM", tumor = "all")
## Warning: 76 out of 756 names invalid: ADAM6, CTAGE5, C21orf59, C11orf57,
## LRRC37BP1, LYRM5, C7orf55, KIAA0391, CASC5, KIAA1524, PA2G4P4,
## C10orf12, PIPSL, FAM175A, MTL5, RPL19P12, SGOL2, STAG3L1, KIAA1919,
## KIAA2022, FAM188B, C16orf59, TTTY15, C17orf53, YBX3P1, TMEM194B,
## ZNF788, MSL3P1, ZNF702P, WDR92, LINC00476, UQCRBP1, ATP5EP2, RPL23P8,
## DPY19L2P2, ARHGAP11B, SGOL1, CCDC173, RPL23AP53, FAM19A3, C12orf66,
## PPIEL, C9orf84, C10orf25, WDR78, C20orf197, PRKY, GUSBP2, ATP6AP1L,
## FAM122C, BMS1P4, MRS2P2, ANXA2P3, C3orf67, PLEKHA8P1, C8orf37, HLA-L,
## HILS1, RPLP0P2, FAM86JP, ZNRD1-AS1, ZNF876P, TMEM14E, LACE1, SEC14L1P1,
## PIN4P1, C22orf34, RPL21P44, TCAM1P, FAM35DP, METTL20, PPP1R2P3,
## SLC6A10P, TCTE3, ZNF815P, MLLT4-AS1.
## See the manual page for valid types.
## `use_raster` is automatically set to TRUE for a matrix with more than
## 2000 columns You can control `use_raster` argument by explicitly
## setting TRUE/FALSE to it.
## 
## Set `ht_opt$message = FALSE` to turn off this message.

testis_expression(Bruggeman_CT$Gene, cells = "all")
## Warning: 88 out of 756 names invalid: ADAM6, CTAGE5, ADORA2A, C21orf59,
## C11orf57, LRRC37BP1, FOXO3B, LYRM5, CBWD1, C7orf55, KIAA0391, CASC5,
## KIAA1524, PA2G4P4, C10orf12, PIPSL, FAM175A, MTL5, RPL19P12, SGOL2,
## STAG3L1, KIAA1919, KIAA2022, FAM188B, C16orf59, PCDHGB2, TTTY15,
## C17orf53, YBX3P1, F2RL2, TMEM194B, MIR17HG, ZNF788, MSL3P1, ZNF702P,
## MPL, WDR92, LINC00476, UQCRBP1, ATP5EP2, RPL23P8, DPY19L2P2, SGOL1,
## CCDC173, RPL23AP53, CBWD6, FAM19A3, NLRP9, C12orf66, EDAR, PPIEL,
## C9orf84, C10orf25, WDR78, C20orf197, PRKY, GUSBP2, ATP6AP1L, FAM122C,
## BMS1P4, MRS2P2, ANXA2P3, C3orf67, PLEKHA8P1, C8orf37, HLA-L, HILS1,
## RPLP0P2, FAM86JP, ZNRD1-AS1, ZNF876P, TMEM14E, LACE1, SEC14L1P1,
## PCDHA6, PIN4P1, C22orf34, RPL21P44, TCAM1P, FAM35DP, METTL20, ASB14,
## PPP1R2P3, SLC6A10P, TCTE3, GK3P, ZNF815P, MLLT4-AS1.
## See the manual page for valid types.
## `use_raster` is automatically set to TRUE for a matrix with more than
## 2000 columns You can control `use_raster` argument by explicitly
## setting TRUE/FALSE to it.
## 
## Set `ht_opt$message = FALSE` to turn off this message.

76 out of the 756 gene names aren’t found in GTEx, and our data.

GTEX expression : There is clearly a lack of testis specificty. Expression in lymphocytes and fibroblasts is not surprising as they excluded it from their analysis.

TCGA expression : Most genes seems to be expressed in all tumours, which is the reflection of an overall expression and not an activation.

Testis cells expression : expression is indeed mainly in germ cells, even if some genes are expressed in somatic cells. However, lots of genes are just not expressed in any of these cell types.

Bruggeman_ours <- Venn(list(Bruggeman = Bruggeman_CT$Gene,
                            CTexploreR = CT_genes$external_gene_name))
gp <- VennThemes(compute.Venn(Bruggeman_ours))
gp[["Face"]][["11"]]$fill <-  "mistyrose"
gp[["Face"]][["01"]]$fill <-  "darkseagreen1"
gp[["Face"]][["10"]]$fill <-  "lightsteelblue1"
gp[["Set"]][["Set1"]]$col <-  "paleturquoise4"
gp[["Set"]][["Set2"]]$col <-  "darkseagreen4"
gp[["SetText"]][["Set1"]]$col <-  "paleturquoise4"
gp[["SetText"]][["Set2"]]$col <-  "darkseagreen4"
plot(Bruggeman_ours, gp = gp)

Bruggeman_others <- Venn(list(Bruggeman = Bruggeman_CT$Gene,
                            Wang = Wang_CT$external_gene_name,
                            CTdatabase = CTdatabase_cleaned$external_gene_name))
plot(Bruggeman_others)

Despite the size of Bruggeman’s list, there is a very small amount of genes in common with CTexploreR. This is not surprising seeing the patterns of expression in normal and tumoral tissues above. I’ll clarify the difference below.

I however manage to recreate the Venn diagram of their fig 2, with slight differences probably due to the cleaning of gene names I have done in Wang and CTdatabase but not in Bruggeman.

Bruggeman_lost <- all_genes %>%
  filter(external_gene_name %in% Bruggeman_ours@IntersectionSets[["10"]])
# Missing the 76 genes

Bruggeman_lost[is.na(Bruggeman_lost$testis_specificity), ]$testis_specificity <- "not_specific"
table(Bruggeman_lost$testis_specificity)
## 
##        not_specific testis_preferential     testis_specific 
##                 619                  18                  13
table(Bruggeman_lost$testis_cell_type)
## 
##  Early_spermatocyte Elongated_spermatid         Endothelial   Late_spermatocyte 
##                  99                  82                  10                  70 
##              Leydig          Macrophage               Myoid     Round_spermatid 
##                   3                   8                  13                 143 
##             Sertoli              Sperm1              Sperm2       Spermatogonia 
##                  27                   7                  14                  65 
##                 SSC 
##                  84
table(Bruggeman_lost$Higher_in_somatic_cell_type)
## 
## FALSE  TRUE 
##   307   322
table(Bruggeman_lost$TCGA_category)
## 
##          activated              leaky multimapping_issue      not_activated 
##                134                493                  1                 22
table(Bruggeman_lost$CCLE_category)
## 
##     activated         leaky not_activated 
##           218           404            28

It is clear here that most genes are not testis specific. Even if they are indeed mostly expressed in germ cells, they are not specific to it as they are not testis specific. We can also see the small amount of genes activated in TCGA samples and CCLE cell lines.

3 CT genes characteristics

table(CT_genes$testis_specificity)
## 
## testis_preferential     testis_specific 
##                  92                 206
table(CT_genes$transcript_biotype)
## 
##                  lncRNA nonsense_mediated_decay          protein_coding 
##                      48                       3                     247

Most genes are testis specific (69.1275168%). Most genes are mainly protein coding genes (82.885906%).

3.1 X chromosome and regulated by methylation

In CTexploreR, genes have been characterised as regulated by methylation or not.

table(CT_genes$X_linked)
## 
## FALSE  TRUE 
##   205    93
table(CT_genes$regulated_by_methylation)
## 
## FALSE  TRUE 
##   138   160
table(CT_genes$X_linked, CT_genes$testis_specificity)
##        
##         testis_preferential testis_specific
##   FALSE                  86             119
##   TRUE                    6              87
table(CT_genes$X_linked, CT_genes$regulated_by_methylation)
##        
##         FALSE TRUE
##   FALSE   128   77
##   TRUE     10   83

Genes are enriched on the X chromosome (31.2080537%). Also, 160 genes have been flagged as regulated by methylation (53.6912752%). It is interesting to study the link between these two characteristics.

On the chromosome X, there is a clear enrichment of CT genes regulated by methylation (83/93 chrX genes or 83/160 genes regulated by methylation).

Let’s check that with a statistical test, I here want to see if, on there is an enrichment of genes regulated by methylation on the X chromosome. I need to do a Pearson Chi square test (to know if observed proportion differ from expected proportion). It is a statistical method to determine if two categorical variables have a significant correlation between them. I can directly put a matrix (my table) in the function

chisq.test(table(CT_genes$X_linked, CT_genes$regulated_by_methylation))
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(CT_genes$X_linked, CT_genes$regulated_by_methylation)
## X-squared = 66.676, df = 1, p-value = 3.2e-16
CT_genes$chr_factor <- factor(CT_genes$chr,
                       levels = c("1", "2", "3", "4", "5", "6", "7", "8", "9",
                                  "10", "11", "12", "13", "14", "15", "16", "17", 
                                  "18", "19", "20", "21", "22", "X", "Y"))

CT_genes %>% 
  mutate(regulated_by_methylation = ifelse(regulated_by_methylation,
                                           "Regulated by methylation",
                                           "Not regulated by methylation")) %>% 
  ggplot(aes(x = chr_factor, fill = testis_specificity, color = X_linked)) +
  geom_bar(stat = 'count') +
  scale_fill_manual(values = c("goldenrod1", "mediumpurple1")) +
  scale_color_manual(values = c("royalblue1", "deeppink"), guide = "none") +
  facet_wrap(~ regulated_by_methylation, ncol = 2) +
  theme_bw() +
  xlab("Chromosome") +
  ylab("Number of genes") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1),
        legend.position = "top",
        axis.title = element_text(size = 10, color = "gray10"))

There is indeed a significative link between regulation by methylation and being on the X chromosome. There is thus an enrichment of CT genes regulated by methylation on the X chromosome (and inversely).

3.2 Tumour and methylation

Using CTexploreR functions, we can explore all CT genes or focus on some potential targets.

3.2.1 All CT

For these heatmaps, the code comes from the function but has been copies to add some annotations.

chr_mat <- as.matrix(CT_genes$X_linked)
chr_mat <- ifelse(chr_mat == TRUE, "X-linked", "Not X")
rownames(chr_mat) <- CT_genes$external_gene_name
row_ha_chr <- rowAnnotation(chr_factor = chr_mat,
                            annotation_legend_param = legends_param,
                            simple_anno_size = unit(0.5, "cm"),
                            col = list(chr_factor = chr_colors),
                            annotation_name_gp = gpar(fontsize = 8),
                            annotation_name_side = "top")



regulation_mat <- as.matrix(CT_genes$regulated_by_methylation)
regulation_mat <- ifelse(regulation_mat == TRUE, "Methylation",
                         "Not methylation")
rownames(regulation_mat) <- CT_genes$external_gene_name


row_ha_reg <- rowAnnotation(regulation = regulation_mat,
                            annotation_legend_param = legends_param,
                            simple_anno_size = unit(0.5, "cm"),
                            col = list(regulation = meth_colors),
                            annotation_name_gp = gpar(fontsize = 8),
                            annotation_name_side = "top")

left_annot <- c(row_ha_chr, row_ha_reg, gap = unit(1, "mm"))

split <- data.frame(CT_genes$regulated_by_methylation, CT_genes$X_linked)
database <- TCGA_TPM
database$tumor <- sub(pattern = 'TCGA-', x = database$project_id, '')
database$type <- "Tumor"
database$type[database$shortLetterCode == "NT"] <- "Peritumoral"
database <- database[, order(database$tumor, database$type)]

genes <- CT_genes$external_gene_name
database <- database[rowData(database)$external_gene_name %in% genes, ]
database <- database[match(genes, rowData(database)$external_gene_name), ]
database <- database[, database$type == "Tumor"]

column_ha_tumor <- HeatmapAnnotation(
  Tumor = database$tumor,
  border = TRUE,
  col = list(Tumor = c("BRCA" = "midnightblue", "COAD" = "darkorchid2",
                 "ESCA" = "gold", "HNSC" = "deeppink2",
                 "LUAD" = "seagreen", "LUSC" = "seagreen3",
                 "SKCM" = "red3")),
  annotation_name_gp = gpar(fontsize = 8),
  annotation_legend_param = legends_param)

split_by <- factor(database$tumor)
col_annot <- column_ha_tumor

mat <- log1p(assay(database))
rownames(mat) <- rowData(database)$external_gene_name


Heatmap(mat[, , drop = FALSE],
        name = "logTPM",
        column_title = paste0("Expression in TCGA samples (all)"),
        column_split = split_by,
        row_split = split,
        row_title_gp = gpar(fontsize = 0),
        col = colorRamp2(seq(0, max(mat), length = 11),
                         legend_colors),
        clustering_method_rows = "ward.D",
        clustering_method_columns = "ward.D",
        cluster_rows = TRUE,
        show_column_names = FALSE,
        cluster_columns = TRUE,
        show_column_dend = FALSE,
        show_row_dend = FALSE,
        row_names_gp = gpar(fontsize = 4),
        heatmap_legend_param = legends_param,
        top_annotation = col_annot,
        left_annotation = left_annot)

database <- CCLE_data
database$type <- tolower(database$type)
genes <- CT_genes$external_gene_name
database <- database[rowData(database)$external_gene_name %in% genes, ]
database <- database[match(genes, rowData(database)$external_gene_name), ]

mat <- log1p(assay(database))
rownames(mat) <- rowData(database)$external_gene_name

df_col <- data.frame("cell_line" = colData(database)$cell_line_name,
                     "type" = as.factor(colData(database)$type))
rownames(df_col) <- rownames(colData(database))
df_col <- df_col[order(df_col$type), ]

column_ha_type <- HeatmapAnnotation(
  type = df_col$type,
  border = TRUE,
  annotation_name_gp = gpar(fontsize = 8),
  annotation_legend_param = legends_param,
  col = list(type = c("lung" = "seagreen3", "skin" = "red3",
                 "bile_duct" = "mediumpurple1", "bladder" = "mistyrose2",
                 "colorectal" = "plum", "lymphoma" = "steelblue1",
                 "uterine" = "darkorange4", "myeloma" = "turquoise3", 
                 "kidney" = "thistle4",
                 "pancreatic" = "darkmagenta", "brain" = "palegreen2",
                 "gastric" = "wheat3", "breast" = "midnightblue",
                 "bone" = "sienna1", "head_and_neck" = "deeppink2",
                 "ovarian" = "tan3", "sarcoma" = "lightcoral",
                 "leukemia" = "steelblue4", "esophageal"= "khaki",
                 "neuroblastoma" = "olivedrab1")))



Heatmap(mat[, rownames(df_col), drop = FALSE],
        name = "logTPM",
        column_title = "Gene Expression in tumor cell lines (CCLE)",
        column_split = factor(df_col$type),
        row_split = split,
        row_title_gp = gpar(fontsize = 0),
        col = colorRamp2(seq(0, max(mat), length = 11),
                          legend_colors),
        clustering_method_rows = "ward.D",
        clustering_method_columns = "ward.D",
        cluster_rows = TRUE,
        show_row_dend = FALSE,
        show_column_names = FALSE,
        cluster_columns = TRUE,
        show_column_dend = FALSE,
        row_names_gp = gpar(fontsize = 4),
        heatmap_legend_param = legends_param,
        top_annotation = c(column_ha_type),
        left_annotation = left_annot)

Next graph was removed from paper

genes <- CT_genes$external_gene_name
database <- CT_mean_methylation_in_tissues[rownames(CT_mean_methylation_in_tissues) %in% genes]

mat <- na.omit(assay(database))
clustering_option <- TRUE

row_ha_chr_meth <- rowAnnotation(chr = chr_mat[rownames(mat),],
                            annotation_legend_param = legends_param,
                            simple_anno_size = unit(0.5, "cm"),
                            col = list(chr = chr_colors),
                            annotation_name_gp = gpar(fontsize = 8),
                            annotation_name_side = "top")


row_ha_reg_meth <- rowAnnotation(regulation = regulation_mat[rownames(mat),],
                            annotation_legend_param = legends_param,
                            simple_anno_size = unit(0.5, "cm"),
                            col = list(regulation = meth_colors),
                            annotation_name_gp = gpar(fontsize = 8),
                            annotation_name_side = "top")

left_annot_meth <- c(row_ha_chr_meth, row_ha_reg_meth, gap = unit(1, "mm"))

split_meth <- data.frame(filter(CT_genes, external_gene_name %in% rownames(mat))$regulated_by_methylation, 
                    filter(CT_genes, external_gene_name %in% rownames(mat))$X_linked)

Heatmap(mat,
        column_title = 'Promoter mean methylation level by tissue',
        name = 'Meth',
        col = colorRamp2(c(1:100),
                         colorRampPalette(c("moccasin","dodgerblue4"))(100)),
        na_col = "gray80",
        cluster_rows = clustering_option,
        cluster_columns = FALSE,
        row_split = split_meth,
        row_title_gp = gpar(fontsize = 0),
        show_row_names = TRUE,
        show_heatmap_legend = TRUE,
        show_row_dend = FALSE,
        row_names_gp = gpar(fontsize = 3),
        column_names_gp = gpar(fontsize = 8),
        column_names_side = "bottom",
        row_names_side = "right",
        left_annotation = left_annot_meth)

3.2.2 Coactivation in tumours

We know that CT genes that are regulated by methylation are activated in tumours due to methylation. However, those that are not it is more blurry. It is interesting to look at their expression to see if there are groups of genes that are activated together in tumours which would suggest another activation : a transcription factor ? a gene expresion program from meiosis ? timing of expression ? function ?

not_regulated <- filter(CT_genes, !regulated_by_methylation)$external_gene_name

As they can blur the clustering, I sometimes remove genes that are in the background and those that are expressed a lot.

  • BRCA :
TCGA_expression(filter(CT_genes, !regulated_by_methylation)$external_gene_name,
                tumor = "BRCA", units = "log_TPM")

# Removing genes to see clearer

BRCA_genes <- rowData(TCGA_TPM) %>% 
  as_tibble() %>% 
  filter(external_gene_name %in% not_regulated) %>% 
  filter(percent_pos_BRCA > 0.01 & percent_pos_BRCA < 30) %>% 
  pull(external_gene_name)


BRCA_genes_ordered <- 
  BRCA_genes[row_order(draw(TCGA_expression(BRCA_genes, tumor = "BRCA", 
                                            units = "log_TPM")))]

BRCA_cluster1 <- BRCA_genes_ordered[c(which(BRCA_genes_ordered == "AKAP14"):
                       which(BRCA_genes_ordered == "C11orf97"))]
BRCA_cluster2 <- BRCA_genes_ordered[c(which(BRCA_genes_ordered == "GTF2I-AS1"):
                       which(BRCA_genes_ordered == "SLIT3-AS1"))]

We can see two small clusters.

  • LUAD :
TCGA_expression(filter(CT_genes, !regulated_by_methylation)$external_gene_name,
                tumor = "LUAD", units = "log_TPM")

# Removing genes to see clearer
LUAD_genes <- rowData(TCGA_TPM) %>% 
  as_tibble() %>% 
  filter(external_gene_name %in% not_regulated) %>% 
  filter(percent_pos_LUAD > 0.01 & percent_pos_LUAD < 30) %>% 
  pull(external_gene_name)


LUAD_genes_ordered <- 
  LUAD_genes[row_order(draw(TCGA_expression(LUAD_genes, tumor = "LUAD", 
                                            units = "log_TPM")))]

LUAD_cluster1 <- c("TSPY1", "TSPY2")
LUAD_cluster2 <- c("SPANXC", "SPANXB1",  "SPANXD")
LUAD_cluster3 <- LUAD_genes_ordered[c(1:22)]
LUAD_cluster4 <- LUAD_genes_ordered[c(which(LUAD_genes_ordered == "GTF2I-AS1"):
                       which(LUAD_genes_ordered == "SLIT3-AS1"))]

22 first genes are clearly quite clustering, there is also a small SPANX cluster, a small TSPY cluster ans a bigger one around the middle of the graph.

  • LUSC :
TCGA_expression(filter(CT_genes, !regulated_by_methylation)$external_gene_name,
                tumor = "LUSC", units = "log_TPM")

# Removing genes to see clearer

LUSC_genes <- rowData(TCGA_TPM) %>% 
  as_tibble() %>% 
  filter(external_gene_name %in% not_regulated) %>% 
  filter(percent_pos_LUSC > 0.01 & percent_pos_LUSC < 30) %>% 
  pull(external_gene_name)

LUSC_genes_ordered <- 
  LUSC_genes[row_order(draw(TCGA_expression(LUSC_genes, tumor = "LUSC", 
                                            units = "log_TPM")))]

LUSC_cluster1 <- c("TSPY1", "TSPY2",  "TSPY9", "TSPY10")
LUSC_cluster2 <- c("SPANXC", "SPANXB1",  "SPANXD")
LUSC_cluster3 <- LUSC_genes_ordered[c(1:15)]

We can see a cluster at the top and two smaller ones corresponding to SPANX and TSPY genes. As well as quite a big cluster with the 15 first lines

  • SKCM :
SKCM_genes <- rowData(TCGA_TPM) %>% 
  as_tibble() %>% 
  filter(external_gene_name %in% not_regulated) %>% 
  filter(percent_pos_SKCM > 0.01 & percent_pos_SKCM < 30) %>% 
  pull(external_gene_name)

TCGA_expression(filter(CT_genes, !regulated_by_methylation)$external_gene_name,
                tumor = "SKCM", units = "log_TPM")

# Lots of genes activated almost everywhere or nowhere that can mask clusters
TCGA_expression(SKCM_genes,
                tumor = "SKCM", units = "log_TPM")

SKCM_cluster <- c("TSPY1", "TSPY2", "TSPY3", "TSPY9", "TSPY10")

It is clear that there is a TSPY cluster again.

  • Comparison

TSPY and SPANX genes are found reactivated/coactivated in different types of tumours. Let’s compare the bigger groups found in LUAD, LUSC and BRCA

upset(fromList(list(LUSC = LUSC_cluster3,
               LUAD1 = LUAD_cluster3,
               LUAD2 = LUAD_cluster4,
               BRCA1 = BRCA_cluster1,
               BRCA2 = BRCA_cluster2)))

There is quite a lot of similarities here between the clusters. Let’s look at the expression in tumours.

TCGA_expression(c(LUSC_cluster3, LUAD_cluster3, LUAD_cluster4, BRCA_cluster1,
                  BRCA_cluster2),
                tumor = "all", units = "log_TPM")
## `use_raster` is automatically set to TRUE for a matrix with more than
## 2000 columns You can control `use_raster` argument by explicitly
## setting TRUE/FALSE to it.
## 
## Set `ht_opt$message = FALSE` to turn off this message.

Let’s have a look at the timings of expression.

testis_expression(c("TSPY1", "TSPY2", "TSPY3", "TSPY9", "TSPY10"), 
                  cells = "all")

testis_expression(c("SPANXC", "SPANXB1",  "SPANXD"), 
                  cells = "all")

testis_expression(LUSC_cluster3, cells = "all")

testis_expression(LUAD_cluster3, cells = "all")

testis_expression(LUAD_cluster4, cells = "all")
## Warning: 4 out of 6 names invalid: GTF2I-AS1, NFIA-AS1, CCDC196, SLIT3-AS1.
## See the manual page for valid types.

testis_expression(BRCA_cluster1, cells = "all")

testis_expression(BRCA_cluster2, cells = "all")
## Warning: 3 out of 8 names invalid: GTF2I-AS1, NFIA-AS1, SLIT3-AS1.
## See the manual page for valid types.

They mainly seem to, in a cluster, be expressed at the same time.

3.2.3 MAGE genes

MAGE_genes <- filter(CT_genes, family == "MAGE")$external_gene_name

TCGA_expression(tumor = "all", 
                genes = MAGE_genes,
                units = "log_TPM")
## `use_raster` is automatically set to TRUE for a matrix with more than
## 2000 columns You can control `use_raster` argument by explicitly
## setting TRUE/FALSE to it.
## 
## Set `ht_opt$message = FALSE` to turn off this message.

CCLE_expression(genes = MAGE_genes,
                 type = c("lung", "skin", "bile_duct", "bladder", 
                          "colorectal", "lymphoma", "uterine",
                          "myeloma", "kidney", "pancreatic", "brain", 
                          "gastric", "breast", "bone", "head_and_neck", 
                          "ovarian", "sarcoma", "leukemia", "esophageal",
                          "neuroblastoma"), units = "log_TPM")

normal_tissues_mean_methylation(MAGE_genes, na.omit = TRUE)

3.3 Single Cell analysis : timing of expression

In CTexploreR, we there is single cell data from the testis, we thus can analyse CT genes expression during spermatogenesis.

There is only data for 251 of our 298 CT genes.

NB : SSC, Spermatogonia and early spermatocytes are premeiotic cells. Late spermatocytes (between both meiosis), round spermatid, elongated spermatid and sperm are postmeiotic cells.

genes_avail <- 
  CT_genes$external_gene_name[CT_genes$external_gene_name %in% unique(rownames(testis_sce))]
table(CT_genes$testis_cell_type)
## 
##  Early_spermatocyte Elongated_spermatid   Late_spermatocyte     Round_spermatid 
##                  41                  25                  23                  64 
##              Sperm1              Sperm2       Spermatogonia                 SSC 
##                  12                  14                  29                  33
table(CT_genes$testis_cell_type)/length(genes_avail)*100
## 
##  Early_spermatocyte Elongated_spermatid   Late_spermatocyte     Round_spermatid 
##           16.334661            9.960159            9.163347           25.498008 
##              Sperm1              Sperm2       Spermatogonia                 SSC 
##            4.780876            5.577689           11.553785           13.147410

41% of genes are mainly expressed pre-meioticly.

germ_cells <- c("SSC", "Spermatogonia", "Early_spermatocyte",
                "Late_spermatocyte","Round_spermatid", "Elongated_spermatid",
                "Sperm1", "Sperm2")
somatic_cells <- c("Macrophage", "Endothelial", "Myoid", "Sertoli", "Leydig")

testis_sce_CT <- testis_sce[genes_avail, ]
  
mat <- SingleCellExperiment::logcounts(testis_sce_CT)

df_col <- data.frame(clusters = colData(testis_sce_CT)$clusters,
                     type = colData(testis_sce_CT)$type,
                     Donor = colData(testis_sce_CT)$Donor)
rownames(df_col) <- colnames(testis_sce_CT)

df_col <- df_col[order(df_col$type),]
df_col$lineage <- "Germ cells"
df_col$lineage[df_col$type %in% somatic_cells] <- "Somatic cells"
    
column_ha_type = HeatmapAnnotation(
  type = df_col$type,
  border = TRUE,
  col = list(type = c("SSC" = "floralwhite", "Spermatogonia" = "moccasin",
                   "Early_spermatocyte" = "gold", 
                   "Late_spermatocyte" = "orange",
                   "Round_spermatid" = "red2", 
                   "Elongated_spermatid" = "darkred",
                   "Sperm1" = "violet", "Sperm2" = "purple", 
                   "Sertoli" = "gray", 
                   "Leydig" = "cadetblue2", "Myoid" = "springgreen3", 
                   "Macrophage" = "gray10",
                   "Endothelial" = "steelblue")),
  
  annotation_name_gp = gpar(fontsize = 8),
  annotation_legend_param = legends_param)
    
column_ha_lineage = HeatmapAnnotation(
  lineage = df_col$lineage,
  border = TRUE,
  col = list(lineage = c("Germ cells" = "salmon", "Somatic cells" = "cyan4")),
  annotation_name_gp = gpar(fontsize = 8),
  annotation_legend_param = legends_param)

scale_lims <- c(0, quantile(rowMax(mat), 0.75))
top_annot <- c(column_ha_lineage, column_ha_type)

# Until here is what's in the function, hereunder is my addition/change in Heatmap()

CT_genes_avail <- filter(CT_genes, external_gene_name %in% genes_avail)

chr_mat <- as.matrix(CT_genes_avail$X_linked)
chr_mat <- ifelse(chr_mat == TRUE, "X-linked", "Not X")
rownames(chr_mat) <- CT_genes_avail$external_gene_name
row_ha_chr <- rowAnnotation(chr = chr_mat,
                            annotation_legend_param = legends_param,
                            simple_anno_size = unit(0.5, "cm"),
                            col = list(chr = chr_colors),
                            annotation_name_gp = gpar(fontsize = 8),
                            annotation_name_side = "top")


regulation_mat <- as.matrix(CT_genes_avail$regulated_by_methylation)
regulation_mat <- ifelse(regulation_mat == TRUE, "Methylation",
                         "Not methylation")
rownames(regulation_mat) <- CT_genes_avail$external_gene_name


row_ha_reg <- rowAnnotation(regulation = regulation_mat,
                            annotation_legend_param = legends_param,
                            simple_anno_size = unit(0.5, "cm"),
                            col = list(regulation = meth_colors),
                            annotation_name_gp = gpar(fontsize = 8),
                            annotation_name_side = "top")

left_annot <- c(row_ha_chr, row_ha_reg, gap = unit(1, "mm"))

split <- data.frame(CT_genes_avail$regulated_by_methylation, CT_genes_avail$X_linked)
    
Heatmap(mat[genes_avail, rownames(df_col), drop = FALSE],
        name = "logCounts",
        column_title = "Expression in testis cells (scRNAseq)",
        column_split = df_col$type,
        row_split = split,
        row_title_gp = gpar(fontsize = 0),
        show_column_names = FALSE,
        show_column_dend = FALSE,
        clustering_method_rows = "ward.D",
        clustering_method_columns = "ward.D",
        cluster_rows = TRUE,
        cluster_columns = FALSE,
        show_row_dend = FALSE,
        row_names_gp = gpar(fontsize = 4),
        col = colorRamp2(seq(scale_lims[1], scale_lims[2], length = 11),                 
                         legend_colors),
        top_annotation = top_annot,
        left_annotation = left_annot,
        heatmap_legend_param = legends_param)

We used these data to determine in which testis cell type each gene is mostly expressed.

CT_genes$main_cell_type_expression <- factor(CT_genes$testis_cell_type,
                                                levels = c("SSC", 
                                               "Spermatogonia", 
                                               "Early_spermatocyte",
                                               "Late_spermatocyte",
                                               "Round_spermatid",
                                               "Elongated_spermatid",
                                               "Sperm1",
                                               "Sperm2",
                                               "Sertoli"))

CT_genes %>% 
  mutate(regulated_by_methylation = ifelse(regulated_by_methylation,
                                           "Regulated by methylation",
                                           "Not regulated by methylation")) %>%
  filter(!is.na(testis_cell_type)) %>%
  filter(main_cell_type_expression != "Sertoli") %>% 
  ggplot(aes(x = chr, fill = main_cell_type_expression,
             color = main_cell_type_expression)) +
  scale_fill_manual(values = c("lightyellow2", "moccasin", "gold", "orange","red2",
                               "darkred", "violet", "purple")) +
  scale_color_manual(values = c("lightyellow3", "navajowhite2", "gold", "orange","red2",
                               "darkred", "violet", "purple")) +
  geom_bar(stat = 'count') +
  facet_grid(main_cell_type_expression ~ regulated_by_methylation) +
  xlab("Chromosome") +
  ylab("Number of genes") +
  theme_bw() +
  theme(
    axis.text.x = element_text(color = "black", angle = 90, vjust = 0.5, size = 10),
    axis.title = element_text(size = 10, color = "gray10"),
    strip.text.y = element_blank()) 

14 genes are on the X chromosome but escape X chromosome inactivation and are activates post-meioticly.

CT_genes %>% 
  filter(X_linked) %>% 
  filter(testis_cell_type %in% c("Late_spermatocyte","Round_spermatid", 
                                 "Elongated_spermatid", "Sperm1", "Sperm2"))

3.4 Gene function

All CT genes function

msigdbr(species = "Homo sapiens" , category = "C5") %>% 
  filter(gene_symbol %in% CT_genes$external_gene_name) %>% 
  pull(gene_symbol) %>% 
  unique() %>% 
  length()
## [1] 184
msigdbr(species = "Homo sapiens" , category = "H") %>% 
  filter(gene_symbol %in% CT_genes$external_gene_name) %>% 
  pull(gene_symbol) %>% 
  unique() %>% 
  length()
## [1] 17
go_ora <- enrichGO(gene = CT_genes$ensembl_gene_id,
                   keyType = "ENSEMBL",
                   OrgDb = org.Hs.eg.db,
                   ont = "all",
                   readable = TRUE)
as_tibble(go_ora)
as_tibble(go_ora) %>% 
  arrange(desc(Count)) %>% 
  head(12) %>% 
  mutate(Ratio = case_when(ONTOLOGY == "BP"~ Count/177,
                           ONTOLOGY == "CC"~ Count/197,
                           ONTOLOGY == "MF"~ Count/186)) %>% 
  ggplot(aes(x = Ratio, y = Description, fill = Description)) +
  geom_col() +
  theme_bw() +
  ylab("GO term") +
  xlab("Gene Ratio") +
  theme(axis.text.y = element_blank(),
        legend.position = "none",
        axis.ticks.y = element_blank(),
        axis.title = element_text(size = 10, color = "gray10"))+ 
  geom_text(aes(0, y = Description, label = Description),
            hjust = 0,
            nudge_x = 0.005,
            colour = "floralwhite",
            size = 4)

ora_to_plot <- as_tibble(simplify(go_ora)) 

# Combine the two cilium movement from BP
ora_to_plot[ora_to_plot$ID == "GO:0003341", "Count"] <- 
  ora_to_plot[ora_to_plot$ID == "GO:0003341", "Count"] +
  ora_to_plot[ora_to_plot$ID == "GO:0060294", "Count"]

ora_to_plot <- ora_to_plot %>% 
  arrange(desc(Count)) %>% 
  head(11) %>% 
  filter(ID != "GO:0060294") %>% 
  mutate(Ratio = case_when(ONTOLOGY == "BP"~ Count/177,
                           ONTOLOGY == "CC"~ Count/197,
                           ONTOLOGY == "MF"~ Count/186))
  
       
ora_to_plot %>% 
  ggplot(aes(x = Ratio, y = Description, fill = Description)) +
  geom_col() +
  theme_bw() +
  ylab("GO term") +
  xlab("Gene Ratio") +
  theme(axis.text.y = element_blank(),
        legend.position = "none",
        axis.ticks.y = element_blank(),
        axis.title = element_text(size = 10, color = "gray10"))+ 
  geom_text(aes(0, y = Description, label = Description),
            hjust = 0,
            nudge_x = 0.005,
            colour = "floralwhite",
            size = 3.7)

As we can see here, most of genes are indeed linked to functions from reproduction. I represented here the 12 categories with the most genes, all from biological processes. However, they are enriched in 89 different GO terms.

Is there a difference between meth reg or not ?

go_ora_meth <- enrichGO(gene = 
                          filter(CT_genes, regulated_by_methylation)$ensembl_gene_id,
                        keyType = "ENSEMBL",
                        OrgDb = org.Hs.eg.db,
                        ont = "all",
                        readable = TRUE)
go_ora_meth <- simplify(go_ora_meth)


go_ora_not_meth <- enrichGO(gene = 
                          filter(CT_genes, !regulated_by_methylation)$ensembl_gene_id,
                        keyType = "ENSEMBL",
                        OrgDb = org.Hs.eg.db,
                        ont = "all",
                        readable = TRUE)

go_ora_not_meth <- simplify(go_ora_not_meth)
as_tibble(go_ora_not_meth)
go_ora_meth_plot <- as_tibble(go_ora_meth) %>% 
  arrange(desc(Count)) %>% 
  head(11) %>% 
  mutate(Ratio = case_when(ONTOLOGY == "BP"~ Count/101,
                           ONTOLOGY == "MF"~ Count/107)) %>%  
  mutate(regulation = "Methylation")

go_ora_not_meth_plot <- as_tibble(go_ora_not_meth) %>% 
  arrange(desc(Count)) %>% 
  head(11) %>% 
  mutate(Ratio = case_when(ONTOLOGY == "BP"~ Count/76,
                           ONTOLOGY == "CC"~ Count/93))%>%  
  mutate(regulation = "Not methylation")


rbind(go_ora_meth_plot, go_ora_not_meth_plot) %>% 
  ggplot(aes(x = Ratio, y = Description, fill = Description)) +
  geom_col() +
  theme_bw() +
  ylab("GO term") +
  xlab("Gene Ratio") +
  facet_wrap(~ regulation) + 
  theme(legend.position = "none",
        axis.ticks.y = element_blank(),
        axis.text.y = element_text(size = 6, color = "gray10"),
        axis.title.x = element_text(size = 10, color = "gray10"),
        axis.title.y = element_blank())+ 
  geom_text(aes(0, y = Description, label = ID),
            hjust = 0,
            nudge_x = 0.005,
            colour = "floralwhite",
            size = 2)

We’ve also added a column tumor suppressor and oncogene in CTexploreR. These information come from Cancermine.

table(CT_genes$oncogene) 
## 
## oncogene 
##       34
table(CT_genes$tumor_suppressor)
## 
## tumor_suppressor 
##               15
table(CT_genes$oncogene, CT_genes$tumor_suppressor) 
##           
##            tumor_suppressor
##   oncogene                9

4 SessionInfo

sessionInfo()
## R version 4.3.0 (2023-04-21)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.2 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0 
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
## 
## locale:
##  [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
##  [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
##  [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
## [10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   
## 
## time zone: Europe/Brussels
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats4    grid      stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] Biostrings_2.70.1           XVector_0.42.0             
##  [3] patchwork_1.1.3             BiocParallel_1.36.0        
##  [5] DOSE_3.28.0                 msigdbr_7.5.1              
##  [7] clusterProfiler_4.10.0      org.Hs.eg.db_3.18.0        
##  [9] AnnotationDbi_1.64.1        SingleCellExperiment_1.24.0
## [11] circlize_0.4.15             ComplexHeatmap_2.18.0      
## [13] UpSetR_1.4.0                SummarizedExperiment_1.32.0
## [15] Biobase_2.62.0              GenomicRanges_1.54.1       
## [17] GenomeInfoDb_1.38.1         IRanges_2.36.0             
## [19] S4Vectors_0.40.1            MatrixGenerics_1.14.0      
## [21] matrixStats_1.1.0           lubridate_1.9.3            
## [23] forcats_1.0.0               stringr_1.5.1              
## [25] dplyr_1.1.3                 purrr_1.0.2                
## [27] tidyr_1.3.0                 tibble_3.2.1               
## [29] ggplot2_3.4.4               tidyverse_2.0.0            
## [31] biomaRt_2.58.0              Vennerable_3.0             
## [33] xtable_1.8-4                gtools_3.9.4               
## [35] reshape_0.8.9               RColorBrewer_1.1-3         
## [37] lattice_0.22-5              RBGL_1.78.0                
## [39] graph_1.80.0                BiocGenerics_0.48.1        
## [41] CTexploreR_0.99.5           CTdata_1.2.0               
## [43] readr_2.1.4                 readxl_1.4.3               
## 
## loaded via a namespace (and not attached):
##   [1] splines_4.3.0                 later_1.3.1                  
##   [3] ggplotify_0.1.2               bitops_1.0-7                 
##   [5] filelock_1.0.2                cellranger_1.1.0             
##   [7] polyclip_1.10-6               XML_3.99-0.14                
##   [9] lifecycle_1.0.4               doParallel_1.0.17            
##  [11] vroom_1.6.4                   MASS_7.3-60                  
##  [13] magrittr_2.0.3                sass_0.4.7                   
##  [15] rmarkdown_2.25                jquerylib_0.1.4              
##  [17] yaml_2.3.7                    httpuv_1.6.12                
##  [19] cowplot_1.1.1                 DBI_1.1.3                    
##  [21] abind_1.4-5                   zlibbioc_1.48.0              
##  [23] ggraph_2.1.0                  RCurl_1.98-1.13              
##  [25] yulab.utils_0.1.0             tweenr_2.0.2                 
##  [27] rappdirs_0.3.3                GenomeInfoDbData_1.2.11      
##  [29] enrichplot_1.22.0             ggrepel_0.9.4                
##  [31] tidytree_0.4.5                codetools_0.2-19             
##  [33] DelayedArray_0.28.0           xml2_1.3.5                   
##  [35] ggforce_0.4.1                 tidyselect_1.2.0             
##  [37] shape_1.4.6                   aplot_0.2.2                  
##  [39] farver_2.1.1                  viridis_0.6.4                
##  [41] BiocFileCache_2.10.1          jsonlite_1.8.7               
##  [43] GetoptLong_1.0.5              ellipsis_0.3.2               
##  [45] tidygraph_1.2.3               iterators_1.0.14             
##  [47] foreach_1.5.2                 tools_4.3.0                  
##  [49] progress_1.2.2                treeio_1.26.0                
##  [51] HPO.db_0.99.2                 Rcpp_1.0.11                  
##  [53] glue_1.6.2                    gridExtra_2.3                
##  [55] SparseArray_1.2.2             xfun_0.41                    
##  [57] qvalue_2.34.0                 withr_2.5.2                  
##  [59] BiocManager_1.30.22           fastmap_1.1.1                
##  [61] fansi_1.0.5                   digest_0.6.33                
##  [63] gridGraphics_0.5-1            timechange_0.2.0             
##  [65] R6_2.5.1                      mime_0.12                    
##  [67] colorspace_2.1-0              Cairo_1.6-1                  
##  [69] GO.db_3.18.0                  RSQLite_2.3.3                
##  [71] utf8_1.2.4                    generics_0.1.3               
##  [73] data.table_1.14.8             prettyunits_1.2.0            
##  [75] graphlayouts_1.0.1            httr_1.4.7                   
##  [77] S4Arrays_1.2.0                scatterpie_0.2.1             
##  [79] pkgconfig_2.0.3               gtable_0.3.4                 
##  [81] blob_1.2.4                    shadowtext_0.1.2             
##  [83] htmltools_0.5.7               fgsea_1.28.0                 
##  [85] clue_0.3-65                   scales_1.2.1                 
##  [87] png_0.1-8                     ggfun_0.1.3                  
##  [89] knitr_1.45                    rstudioapi_0.15.0            
##  [91] tzdb_0.4.0                    reshape2_1.4.4               
##  [93] rjson_0.2.21                  nlme_3.1-163                 
##  [95] curl_5.1.0                    cachem_1.0.8                 
##  [97] GlobalOptions_0.1.2           BiocVersion_3.18.0           
##  [99] parallel_4.3.0                HDO.db_0.99.1                
## [101] pillar_1.9.0                  vctrs_0.6.4                  
## [103] promises_1.2.1                dbplyr_2.4.0                 
## [105] cluster_2.1.4                 evaluate_0.23                
## [107] magick_2.8.1                  cli_3.6.1                    
## [109] compiler_4.3.0                rlang_1.1.2                  
## [111] crayon_1.5.2                  labeling_0.4.3               
## [113] fs_1.6.3                      plyr_1.8.9                   
## [115] stringi_1.8.1                 viridisLite_0.4.2            
## [117] babelgene_22.9                MPO.db_0.99.7                
## [119] munsell_0.5.0                 lazyeval_0.2.2               
## [121] GOSemSim_2.28.0               Matrix_1.6-1.1               
## [123] ExperimentHub_2.10.0          hms_1.1.3                    
## [125] bit64_4.0.5                   KEGGREST_1.42.0              
## [127] shiny_1.7.5.1                 highr_0.10                   
## [129] interactiveDisplayBase_1.40.0 AnnotationHub_3.10.0         
## [131] igraph_1.5.1                  memoise_2.0.1                
## [133] bslib_0.5.1                   ggtree_3.10.0                
## [135] fastmatch_1.1-4               bit_4.0.5                    
## [137] gson_0.1.0                    ape_5.7-1